{
  "cells": [
    {
      "cell_type": "markdown",
      "id": "463fc59e",
      "metadata": {},
      "source": [
        "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mongodb-developer/GenAI-Showcase/blob/main/notebooks/rag/rag_chunking_strategies.ipynb)\n"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "55f28a64",
      "metadata": {},
      "source": [
        "# RAG Series Part 3: Choosing the right chunking strategy for RAG\n",
        "\n",
        "In this notebook, we will explore and evaluate different chunking techniques for RAG.\n"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "7d5e0bc8",
      "metadata": {},
      "source": [
        "## Step 1: Install required libraries\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 1,
      "id": "2f403d1b",
      "metadata": {},
      "outputs": [],
      "source": [
        "%pip install -qU langchain langchain-community langchain-openai langchain-mongodb langchain-experimental ragas pymongo tqdm beautifulsoup4"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "edfbd26b",
      "metadata": {},
      "source": [
        "## Step 2: Setup pre-requisites\n",
        "\n",
        "- Set the MongoDB connection string. Follow the steps [here](https://www.mongodb.com/docs/manual/reference/connection-string/) to get the connection string from the Atlas UI.\n",
        "\n",
        "- Set the OpenAI API key. Steps to obtain an API key are [here](https://help.openai.com/en/articles/4936850-where-do-i-find-my-openai-api-key).\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 2,
      "id": "3e9fcf67",
      "metadata": {},
      "outputs": [],
      "source": [
        "import getpass\n",
        "import os\n",
        "\n",
        "from openai import OpenAI"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 3,
      "id": "6dd337af",
      "metadata": {},
      "outputs": [],
      "source": [
        "# Prompt for the key without echoing it and publish it through the environment\n",
        "# before any OpenAI-backed object is created without an explicit key.\n",
        "os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(prompt=\"Enter your OpenAI API Key:\")\n",
        "openai_client = OpenAI()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 4,
      "id": "ba4f45fd",
      "metadata": {},
      "outputs": [],
      "source": [
        "# Connection string is read interactively so it never lands in the notebook source\n",
        "MONGODB_URI = getpass.getpass(prompt=\"Enter your MongoDB connection string:\")"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "32216846",
      "metadata": {},
      "source": [
        "## Step 3: Load the dataset\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 5,
      "id": "57121b3e",
      "metadata": {},
      "outputs": [],
      "source": [
        "from langchain_community.document_loaders import WebBaseLoader\n",
        "\n",
        "# PEP pages that make up the corpus for the chunking experiments\n",
        "PEP_URLS = [\n",
        "    \"https://peps.python.org/pep-0483/\",\n",
        "    \"https://peps.python.org/pep-0008/\",\n",
        "    \"https://peps.python.org/pep-0257/\",\n",
        "]\n",
        "\n",
        "web_loader = WebBaseLoader(PEP_URLS)\n",
        "\n",
        "# Fetch every URL and keep the loaded documents for chunking and test-set generation\n",
        "pages = web_loader.load()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 6,
      "id": "255d751e",
      "metadata": {},
      "outputs": [
        {
          "data": {
            "text/plain": [
              "3"
            ]
          },
          "execution_count": 6,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "len(pages)"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "b46573da",
      "metadata": {},
      "source": [
        "## Step 4: Define chunking functions\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 34,
      "id": "defa10d4",
      "metadata": {},
      "outputs": [],
      "source": [
        "from typing import Dict, List, Optional\n",
        "\n",
        "from langchain.text_splitter import (\n",
        "    Language,\n",
        "    RecursiveCharacterTextSplitter,\n",
        "    TokenTextSplitter,\n",
        ")\n",
        "from langchain_core.documents import Document\n",
        "from langchain_experimental.text_splitter import SemanticChunker\n",
        "from langchain_openai.embeddings import OpenAIEmbeddings"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 8,
      "id": "fa29d3f4",
      "metadata": {},
      "outputs": [],
      "source": [
        "def fixed_token_split(\n",
        "    docs: List[Document], chunk_size: int, chunk_overlap: int\n",
        ") -> List[Document]:\n",
        "    \"\"\"\n",
        "    Split documents into fixed-size token windows (cl100k_base encoding).\n",
        "\n",
        "    Args:\n",
        "        docs (List[Document]): Documents to split\n",
        "        chunk_size (int): Number of tokens per chunk\n",
        "        chunk_overlap (int): Number of tokens shared by neighbouring chunks\n",
        "\n",
        "    Returns:\n",
        "        List[Document]: The chunked documents\n",
        "    \"\"\"\n",
        "    token_splitter = TokenTextSplitter(\n",
        "        encoding_name=\"cl100k_base\",\n",
        "        chunk_size=chunk_size,\n",
        "        chunk_overlap=chunk_overlap,\n",
        "    )\n",
        "    chunks = token_splitter.split_documents(docs)\n",
        "    return chunks"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 9,
      "id": "5a62e2c2",
      "metadata": {},
      "outputs": [],
      "source": [
        "def recursive_split(\n",
        "    docs: List[Document],\n",
        "    chunk_size: int,\n",
        "    chunk_overlap: int,\n",
        "    language: Optional[Language] = None,\n",
        ") -> List[Document]:\n",
        "    \"\"\"\n",
        "    Split documents recursively over an ordered list of separators.\n",
        "\n",
        "    Chunk sizes are measured in cl100k_base tokens.\n",
        "\n",
        "    Args:\n",
        "        docs (List[Document]): Documents to split\n",
        "        chunk_size (int): Number of tokens per chunk\n",
        "        chunk_overlap (int): Number of tokens shared by neighbouring chunks\n",
        "        language (Optional[Language], optional): When given, the separators\n",
        "            registered for this language replace the generic ones. Defaults to None.\n",
        "\n",
        "    Returns:\n",
        "        List[Document]: The chunked documents\n",
        "    \"\"\"\n",
        "    # Generic separators, used when no language is given or the lookup fails\n",
        "    separators = [\"\\n\\n\", \"\\n\", \" \", \"\"]\n",
        "\n",
        "    if language is not None:\n",
        "        try:\n",
        "            language_separators = (\n",
        "                RecursiveCharacterTextSplitter.get_separators_for_language(language)\n",
        "            )\n",
        "        except (NameError, ValueError):\n",
        "            print(f\"No separators found for language {language}. Using defaults.\")\n",
        "        else:\n",
        "            separators = language_separators\n",
        "\n",
        "    return RecursiveCharacterTextSplitter.from_tiktoken_encoder(\n",
        "        encoding_name=\"cl100k_base\",\n",
        "        chunk_size=chunk_size,\n",
        "        chunk_overlap=chunk_overlap,\n",
        "        separators=separators,\n",
        "    ).split_documents(docs)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 10,
      "id": "7f05d15a",
      "metadata": {},
      "outputs": [],
      "source": [
        "def semantic_split(docs: List[Document]) -> List[Document]:\n",
        "    \"\"\"\n",
        "    Split documents with SemanticChunker, using OpenAI embeddings and\n",
        "    the \"percentile\" breakpoint threshold type.\n",
        "\n",
        "    Args:\n",
        "        docs (List[Document]): Documents to split\n",
        "\n",
        "    Returns:\n",
        "        List[Document]: The chunked documents\n",
        "    \"\"\"\n",
        "    embedding_model = OpenAIEmbeddings()\n",
        "    semantic_chunker = SemanticChunker(\n",
        "        embedding_model, breakpoint_threshold_type=\"percentile\"\n",
        "    )\n",
        "    return semantic_chunker.split_documents(docs)"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "de19adb6",
      "metadata": {},
      "source": [
        "## Step 5: Generate the evaluation dataset\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "01782bd5",
      "metadata": {},
      "outputs": [],
      "source": [
        "from langchain_openai import ChatOpenAI, OpenAIEmbeddings\n",
        "from ragas import RunConfig\n",
        "from ragas.testset.evolutions import multi_context, reasoning, simple\n",
        "from ragas.testset.generator import TestsetGenerator\n",
        "\n",
        "# Shared RAGAS run settings, reused for test-set generation and for evaluation\n",
        "RUN_CONFIG = RunConfig(max_wait=180, max_workers=4)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 16,
      "id": "04aef624",
      "metadata": {},
      "outputs": [
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "Filename and doc_id are the same for all nodes.                 \n",
            "Generating: 100%|██████████| 10/10 [01:16<00:00,  7.68s/it]\n"
          ]
        }
      ],
      "source": [
        "# Share of each question type in the generated test set\n",
        "distributions = {simple: 0.5, multi_context: 0.4, reasoning: 0.1}\n",
        "\n",
        "# OpenAI models backing the generator: one LLM drafts, a second one critiques\n",
        "generator_llm = ChatOpenAI(model=\"gpt-3.5-turbo-16k\")\n",
        "critic_llm = ChatOpenAI(model=\"gpt-4\")\n",
        "embeddings = OpenAIEmbeddings()\n",
        "generator = TestsetGenerator.from_langchain(generator_llm, critic_llm, embeddings)\n",
        "\n",
        "# Build a 10-question test set from the loaded pages\n",
        "testset = generator.generate_with_langchain_docs(\n",
        "    pages,\n",
        "    test_size=10,\n",
        "    distributions=distributions,\n",
        "    run_config=RUN_CONFIG,\n",
        ")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 17,
      "id": "2b1ec262",
      "metadata": {},
      "outputs": [],
      "source": [
        "# Convert the generated test set to a pandas DataFrame.\n",
        "# The guard keeps this cell re-runnable: after the first run `testset` is\n",
        "# already a DataFrame, which has no `to_pandas` method.\n",
        "if hasattr(testset, \"to_pandas\"):\n",
        "    testset = testset.to_pandas()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 18,
      "id": "d8ea0f63",
      "metadata": {},
      "outputs": [
        {
          "data": {
            "text/plain": [
              "10"
            ]
          },
          "execution_count": 18,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "len(testset)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 19,
      "id": "37b4ade3",
      "metadata": {},
      "outputs": [
        {
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>question</th>\n",
              "      <th>contexts</th>\n",
              "      <th>ground_truth</th>\n",
              "      <th>evolution_type</th>\n",
              "      <th>metadata</th>\n",
              "      <th>episode_done</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>What is the purpose of the Callable type in Py...</td>\n",
              "      <td>[ items, the\\nfirst is an int, the second is a...</td>\n",
              "      <td>The Callable type in Python's typing module is...</td>\n",
              "      <td>simple</td>\n",
              "      <td>[{'source': 'https://peps.python.org/pep-0483/...</td>\n",
              "      <td>True</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>What naming convention should be used for type...</td>\n",
              "      <td>[�L’ instead.\\n\\n\\nASCII Compatibility\\nIdenti...</td>\n",
              "      <td>nan</td>\n",
              "      <td>simple</td>\n",
              "      <td>[{'source': 'https://peps.python.org/pep-0008/...</td>\n",
              "      <td>True</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>What is the recommended approach for implement...</td>\n",
              "      <td>[ations.\\n\\nComparisons to singletons like Non...</td>\n",
              "      <td>When implementing ordering operations with ric...</td>\n",
              "      <td>simple</td>\n",
              "      <td>[{'source': 'https://peps.python.org/pep-0008/...</td>\n",
              "      <td>True</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>Why should blank lines be removed from the beg...</td>\n",
              "      <td>[ fits on a line, place the closing quotes\\non...</td>\n",
              "      <td>Blank lines should be removed from the beginni...</td>\n",
              "      <td>simple</td>\n",
              "      <td>[{'source': 'https://peps.python.org/pep-0257/...</td>\n",
              "      <td>True</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>What are some ways to declare types in Python?</td>\n",
              "      <td>[class Derived(Base[T_co]):\\n    ...\\n\\n\\nA ty...</td>\n",
              "      <td>Type variables can be declared in unconstraine...</td>\n",
              "      <td>simple</td>\n",
              "      <td>[{'source': 'https://peps.python.org/pep-0483/...</td>\n",
              "      <td>True</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "                                            question  \\\n",
              "0  What is the purpose of the Callable type in Py...   \n",
              "1  What naming convention should be used for type...   \n",
              "2  What is the recommended approach for implement...   \n",
              "3  Why should blank lines be removed from the beg...   \n",
              "4     What are some ways to declare types in Python?   \n",
              "\n",
              "                                            contexts  \\\n",
              "0  [ items, the\\nfirst is an int, the second is a...   \n",
              "1  [�L’ instead.\\n\\n\\nASCII Compatibility\\nIdenti...   \n",
              "2  [ations.\\n\\nComparisons to singletons like Non...   \n",
              "3  [ fits on a line, place the closing quotes\\non...   \n",
              "4  [class Derived(Base[T_co]):\\n    ...\\n\\n\\nA ty...   \n",
              "\n",
              "                                        ground_truth evolution_type  \\\n",
              "0  The Callable type in Python's typing module is...         simple   \n",
              "1                                                nan         simple   \n",
              "2  When implementing ordering operations with ric...         simple   \n",
              "3  Blank lines should be removed from the beginni...         simple   \n",
              "4  Type variables can be declared in unconstraine...         simple   \n",
              "\n",
              "                                            metadata  episode_done  \n",
              "0  [{'source': 'https://peps.python.org/pep-0483/...          True  \n",
              "1  [{'source': 'https://peps.python.org/pep-0008/...          True  \n",
              "2  [{'source': 'https://peps.python.org/pep-0008/...          True  \n",
              "3  [{'source': 'https://peps.python.org/pep-0257/...          True  \n",
              "4  [{'source': 'https://peps.python.org/pep-0483/...          True  "
            ]
          },
          "execution_count": 19,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "testset.head()"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "23bfd83f",
      "metadata": {},
      "source": [
        "## Step 6: Evaluate chunking strategies\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "e18b907c",
      "metadata": {},
      "outputs": [],
      "source": [
        "from langchain_mongodb import MongoDBAtlasVectorSearch\n",
        "from pymongo import MongoClient\n",
        "\n",
        "# Target database/collection and the Atlas Vector Search index name\n",
        "DB_NAME = \"evals\"\n",
        "COLLECTION_NAME = \"chunking\"\n",
        "ATLAS_VECTOR_SEARCH_INDEX_NAME = \"vector_index\"\n",
        "\n",
        "client = MongoClient(MONGODB_URI, appname=\"devrel.showcase.chunking_strategies\")\n",
        "MONGODB_COLLECTION = client[DB_NAME][COLLECTION_NAME]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 32,
      "id": "ac29ad2b",
      "metadata": {},
      "outputs": [],
      "source": [
        "def create_vector_store(docs: List[Document]) -> MongoDBAtlasVectorSearch:\n",
        "    \"\"\"\n",
        "    Build a MongoDB Atlas vector store from the given documents.\n",
        "\n",
        "    Documents are embedded with the \"text-embedding-3-small\" model and tied to\n",
        "    MONGODB_COLLECTION and the ATLAS_VECTOR_SEARCH_INDEX_NAME index.\n",
        "\n",
        "    Args:\n",
        "        docs (List[Document]): Documents to load into the vector store\n",
        "\n",
        "    Returns:\n",
        "        MongoDBAtlasVectorSearch: The populated vector store\n",
        "    \"\"\"\n",
        "    embedding_model = OpenAIEmbeddings(model=\"text-embedding-3-small\")\n",
        "    return MongoDBAtlasVectorSearch.from_documents(\n",
        "        documents=docs,\n",
        "        embedding=embedding_model,\n",
        "        collection=MONGODB_COLLECTION,\n",
        "        index_name=ATLAS_VECTOR_SEARCH_INDEX_NAME,\n",
        "    )"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 22,
      "id": "7d534d1a",
      "metadata": {},
      "outputs": [],
      "source": [
        "import nest_asyncio\n",
        "from datasets import Dataset\n",
        "from ragas import evaluate\n",
        "from ragas.metrics import context_precision, context_recall\n",
        "from tqdm import tqdm\n",
        "\n",
        "# RAGAS uses asyncio; allow it to nest inside the notebook's running event loop\n",
        "nest_asyncio.apply()\n",
        "\n",
        "# Disable tqdm locks\n",
        "tqdm.get_lock().locks = []\n",
        "\n",
        "# Questions and reference answers from the generated test set, in matching row order.\n",
        "# NOTE(review): the test-set preview shows a `nan` ground_truth for one row; such\n",
        "# rows are passed through unfiltered here - confirm that is intended.\n",
        "QUESTIONS = testset[\"question\"].tolist()\n",
        "GROUND_TRUTH = testset[\"ground_truth\"].tolist()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 30,
      "id": "34b66097",
      "metadata": {},
      "outputs": [],
      "source": [
        "def perform_eval(docs: List[Document]) -> Dict[str, float]:\n",
        "    \"\"\"\n",
        "    Index the given chunks and score retrieval with RAGAS.\n",
        "\n",
        "    The target collection is emptied, a vector store is built from `docs`,\n",
        "    the top 3 chunks are retrieved for every question in QUESTIONS, and\n",
        "    context precision and context recall are computed against GROUND_TRUTH.\n",
        "\n",
        "    Args:\n",
        "        docs (List[Document]): Chunks to load into the vector store\n",
        "\n",
        "    Returns:\n",
        "        Dict[str, float]: Evaluation metrics returned by RAGAS\n",
        "    \"\"\"\n",
        "    # Start from an empty collection so chunks from a previous run cannot be retrieved\n",
        "    print(f\"Deleting existing documents in the collection {DB_NAME}.{COLLECTION_NAME}\")\n",
        "    MONGODB_COLLECTION.delete_many({})\n",
        "    print(\"Deletion complete\")\n",
        "    vector_store = create_vector_store(docs)\n",
        "\n",
        "    # Retrieve the page contents of the top matches for each evaluation question\n",
        "    print(\"Getting contexts for evaluation set\")\n",
        "    contexts = []\n",
        "    for question in tqdm(QUESTIONS):\n",
        "        retrieved_docs = vector_store.similarity_search(question, k=3)\n",
        "        contexts.append([doc.page_content for doc in retrieved_docs])\n",
        "\n",
        "    # RAGAS expects a Dataset object\n",
        "    dataset = Dataset.from_dict(\n",
        "        {\n",
        "            \"question\": QUESTIONS,\n",
        "            \"ground_truth\": GROUND_TRUTH,\n",
        "            \"contexts\": contexts,\n",
        "        }\n",
        "    )\n",
        "\n",
        "    print(\"Running evals\")\n",
        "    return evaluate(\n",
        "        dataset=dataset,\n",
        "        metrics=[context_precision, context_recall],\n",
        "        run_config=RUN_CONFIG,\n",
        "        raise_exceptions=False,\n",
        "    )"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 24,
      "id": "26730cfc",
      "metadata": {},
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "CHUNK SIZE: 100\n",
            "------ Fixed token without overlap ------\n",
            "Deleting existing documents in the collection evals.chunking\n",
            "Deletion complete\n",
            "Getting contexts for evaluation set\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "100%|██████████| 10/10 [00:01<00:00,  5.22it/s]\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Running evals\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "Evaluating: 100%|██████████| 20/20 [00:23<00:00,  1.17s/it]\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Result: {'context_precision': 0.8583, 'context_recall': 0.7833}\n",
            "------ Fixed token with overlap ------\n",
            "Deleting existing documents in the collection evals.chunking\n",
            "Deletion complete\n",
            "Getting contexts for evaluation set\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "100%|██████████| 10/10 [00:01<00:00,  5.12it/s]\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Running evals\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "Evaluating: 100%|██████████| 20/20 [00:18<00:00,  1.09it/s]\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Result: {'context_precision': 0.9000, 'context_recall': 0.9500}\n",
            "------ Recursive with overlap ------\n",
            "Deleting existing documents in the collection evals.chunking\n",
            "Deletion complete\n",
            "Getting contexts for evaluation set\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "100%|██████████| 10/10 [00:02<00:00,  4.93it/s]\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Running evals\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "Evaluating: 100%|██████████| 20/20 [00:22<00:00,  1.10s/it]\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Result: {'context_precision': 0.9000, 'context_recall': 0.9833}\n",
            "------ Recursive Python splitter with overlap ------\n",
            "Deleting existing documents in the collection evals.chunking\n",
            "Deletion complete\n",
            "Getting contexts for evaluation set\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "100%|██████████| 10/10 [00:02<00:00,  4.90it/s]\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Running evals\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "Evaluating: 100%|██████████| 20/20 [00:22<00:00,  1.15s/it]\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Result: {'context_precision': 0.9833, 'context_recall': 0.9833}\n",
            "CHUNK SIZE: 200\n",
            "------ Fixed token without overlap ------\n",
            "Deleting existing documents in the collection evals.chunking\n",
            "Deletion complete\n",
            "Getting contexts for evaluation set\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "100%|██████████| 10/10 [00:02<00:00,  4.94it/s]\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Running evals\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "Evaluating: 100%|██████████| 20/20 [00:21<00:00,  1.09s/it]\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Result: {'context_precision': 0.9000, 'context_recall': 0.9000}\n",
            "------ Fixed token with overlap ------\n",
            "Deleting existing documents in the collection evals.chunking\n",
            "Deletion complete\n",
            "Getting contexts for evaluation set\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "100%|██████████| 10/10 [00:01<00:00,  5.10it/s]\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Running evals\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "Evaluating: 100%|██████████| 20/20 [00:20<00:00,  1.03s/it]\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Result: {'context_precision': 1.0000, 'context_recall': 0.9383}\n",
            "------ Recursive with overlap ------\n",
            "Deleting existing documents in the collection evals.chunking\n",
            "Deletion complete\n",
            "Getting contexts for evaluation set\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "100%|██████████| 10/10 [00:01<00:00,  5.13it/s]\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Running evals\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "Evaluating: 100%|██████████| 20/20 [00:22<00:00,  1.12s/it]\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Result: {'context_precision': 0.9000, 'context_recall': 0.9008}\n",
            "------ Recursive Python splitter with overlap ------\n",
            "Deleting existing documents in the collection evals.chunking\n",
            "Deletion complete\n",
            "Getting contexts for evaluation set\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "100%|██████████| 10/10 [00:02<00:00,  4.75it/s]\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Running evals\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "Evaluating: 100%|██████████| 20/20 [00:21<00:00,  1.10s/it]\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Result: {'context_precision': 1.0000, 'context_recall': 0.8583}\n",
            "CHUNK SIZE: 500\n",
            "------ Fixed token without overlap ------\n",
            "Deleting existing documents in the collection evals.chunking\n",
            "Deletion complete\n",
            "Getting contexts for evaluation set\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "100%|██████████| 10/10 [00:02<00:00,  4.99it/s]\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Running evals\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "Evaluating: 100%|██████████| 20/20 [00:22<00:00,  1.11s/it]\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Result: {'context_precision': 0.8833, 'context_recall': 0.9500}\n",
            "------ Fixed token with overlap ------\n",
            "Deleting existing documents in the collection evals.chunking\n",
            "Deletion complete\n",
            "Getting contexts for evaluation set\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "100%|██████████| 10/10 [00:02<00:00,  4.77it/s]\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Running evals\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "Evaluating: 100%|██████████| 20/20 [00:17<00:00,  1.15it/s]\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Result: {'context_precision': 0.7000, 'context_recall': 0.9000}\n",
            "------ Recursive with overlap ------\n",
            "Deleting existing documents in the collection evals.chunking\n",
            "Deletion complete\n",
            "Getting contexts for evaluation set\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "100%|██████████| 10/10 [00:02<00:00,  4.65it/s]\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Running evals\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "Evaluating: 100%|██████████| 20/20 [00:20<00:00,  1.02s/it]\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Result: {'context_precision': 0.5667, 'context_recall': 0.8236}\n",
            "------ Recursive Python splitter with overlap ------\n",
            "Deleting existing documents in the collection evals.chunking\n",
            "Deletion complete\n",
            "Getting contexts for evaluation set\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "100%|██████████| 10/10 [00:01<00:00,  5.11it/s]\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Running evals\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "Evaluating: 100%|██████████| 20/20 [00:15<00:00,  1.30it/s]\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Result: {'context_precision': 0.6000, 'context_recall': 0.8800}\n",
            "CHUNK SIZE: 1000\n",
            "------ Fixed token without overlap ------\n",
            "Deleting existing documents in the collection evals.chunking\n",
            "Deletion complete\n",
            "Getting contexts for evaluation set\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "100%|██████████| 10/10 [00:01<00:00,  5.18it/s]\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Running evals\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "Evaluating: 100%|██████████| 20/20 [00:18<00:00,  1.08it/s]\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Result: {'context_precision': 0.9000, 'context_recall': 0.8909}\n",
            "------ Fixed token with overlap ------\n",
            "Deleting existing documents in the collection evals.chunking\n",
            "Deletion complete\n",
            "Getting contexts for evaluation set\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "100%|██████████| 10/10 [00:02<00:00,  4.27it/s]\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Running evals\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "Evaluating: 100%|██████████| 20/20 [00:17<00:00,  1.15it/s]\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Result: {'context_precision': 0.7833, 'context_recall': 0.8909}\n",
            "------ Recursive with overlap ------\n",
            "Deleting existing documents in the collection evals.chunking\n",
            "Deletion complete\n",
            "Getting contexts for evaluation set\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "100%|██████████| 10/10 [00:03<00:00,  2.64it/s]\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Running evals\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "Evaluating: 100%|██████████| 20/20 [00:19<00:00,  1.02it/s]\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Result: {'context_precision': 0.7833, 'context_recall': 0.8800}\n",
            "------ Recursive Python splitter with overlap ------\n",
            "Deleting existing documents in the collection evals.chunking\n",
            "Deletion complete\n",
            "Getting contexts for evaluation set\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "100%|██████████| 10/10 [00:02<00:00,  4.64it/s]\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Running evals\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "Evaluating: 100%|██████████| 20/20 [00:19<00:00,  1.01it/s]\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Result: {'context_precision': 0.8000, 'context_recall': 0.8709}\n",
            "------ Semantic chunking ------\n",
            "Deleting existing documents in the collection evals.chunking\n",
            "Deletion complete\n",
            "Getting contexts for evaluation set\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "100%|██████████| 10/10 [00:02<00:00,  4.69it/s]\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Running evals\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "Evaluating: 100%|██████████| 20/20 [00:23<00:00,  1.16s/it]"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Result: {'context_precision': 0.9000, 'context_recall': 0.8187}\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "\n"
          ]
        }
      ],
      "source": [
        "# Sweep the chunk sizes; the overlap is derived as 15% of each chunk size.\n",
        "for chunk_size in (100, 200, 500, 1000):\n",
        "    chunk_overlap = int(0.15 * chunk_size)\n",
        "    print(f\"CHUNK SIZE: {chunk_size}\")\n",
        "    # Each entry pairs a header with a thunk, so the splitting for a strategy\n",
        "    # only starts after its header has been printed.\n",
        "    strategies = [\n",
        "        (\n",
        "            \"------ Fixed token without overlap ------\",\n",
        "            lambda: fixed_token_split(pages, chunk_size, 0),\n",
        "        ),\n",
        "        (\n",
        "            \"------ Fixed token with overlap ------\",\n",
        "            lambda: fixed_token_split(pages, chunk_size, chunk_overlap),\n",
        "        ),\n",
        "        (\n",
        "            \"------ Recursive with overlap ------\",\n",
        "            lambda: recursive_split(pages, chunk_size, chunk_overlap),\n",
        "        ),\n",
        "        (\n",
        "            \"------ Recursive Python splitter with overlap ------\",\n",
        "            lambda: recursive_split(\n",
        "                pages, chunk_size, chunk_overlap, Language.PYTHON\n",
        "            ),\n",
        "        ),\n",
        "    ]\n",
        "    for header, make_chunks in strategies:\n",
        "        print(header)\n",
        "        scores = perform_eval(make_chunks())\n",
        "        print(f\"Result: {scores}\")\n",
        "\n",
        "# semantic_split is called with only `pages` (no size/overlap arguments),\n",
        "# so it is evaluated a single time after the sweep.\n",
        "print(\"------ Semantic chunking ------\")\n",
        "semantic_scores = perform_eval(semantic_split(pages))\n",
        "print(f\"Result: {semantic_scores}\")"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.12.1"
    },
    "widgets": {
      "application/vnd.jupyter.widget-state+json": {
        "state": {}
      }
    }
  },
  "nbformat": 4,
  "nbformat_minor": 5
}
